# import libraries
from tensorflow import keras
from sklearn.preprocessing import StandardScaler
import pandas as pd
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
import math
from tensorflow.keras import layers
from tensorflow.keras.preprocessing.sequence import TimeseriesGenerator
from sklearn.metrics import f1_score, precision_score, recall_score
import warnings
warnings.filterwarnings("ignore")
import plotly.graph_objects as go
np.random.seed(1)
tf.random.set_seed(1)
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Dropout, RepeatVector, TimeDistributed, Bidirectional
# Load the KPI dataset and parse the timestamp column into tz-aware datetimes
huwaei=pd.read_csv('training_1000.csv')
huwaei['timestamp']=pd.to_datetime(huwaei['timestamp'])
huwaei.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 20159 entries, 0 to 20158 Data columns (total 4 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 timestamp 20159 non-null datetime64[ns, pytz.FixedOffset(120)] 1 kpi_value 20159 non-null float64 2 request_count 20159 non-null int64 3 anomaly_label 20159 non-null int64 dtypes: datetime64[ns, pytz.FixedOffset(120)](1), float64(1), int64(2) memory usage: 630.1 KB
# Preview the first five rows
huwaei.head()
| timestamp | kpi_value | request_count | anomaly_label | |
|---|---|---|---|---|
| 0 | 2020-08-14 02:00:00+02:00 | 0.998755 | 24908 | 0 |
| 1 | 2020-08-14 02:01:00+02:00 | 0.997683 | 25029 | 0 |
| 2 | 2020-08-14 02:02:00+02:00 | 0.998341 | 24115 | 0 |
| 3 | 2020-08-14 02:03:00+02:00 | 0.998211 | 24031 | 0 |
| 4 | 2020-08-14 02:04:00+02:00 | 0.998403 | 23790 | 0 |
# Preview the last five rows
huwaei.tail()
| timestamp | kpi_value | request_count | anomaly_label | |
|---|---|---|---|---|
| 20154 | 2020-08-28 01:56:00+02:00 | 0.998149 | 26467 | 0 |
| 20155 | 2020-08-28 01:57:00+02:00 | 0.998340 | 26502 | 0 |
| 20156 | 2020-08-28 01:58:00+02:00 | 0.998364 | 26887 | 0 |
| 20157 | 2020-08-28 01:59:00+02:00 | 0.998428 | 26712 | 0 |
| 20158 | 2020-08-28 02:00:00+02:00 | 0.997407 | 29694 | 0 |
# Summary statistics for the numeric columns
huwaei.describe()
| kpi_value | request_count | anomaly_label | |
|---|---|---|---|
| count | 20159.000000 | 20159.000000 | 20159.000000 |
| mean | 0.992995 | 13648.733419 | 0.034228 |
| std | 0.004018 | 6545.602482 | 0.181819 |
| min | 0.951782 | 3928.000000 | 0.000000 |
| 25% | 0.989950 | 8535.000000 | 0.000000 |
| 50% | 0.992644 | 10325.000000 | 0.000000 |
| 75% | 0.996622 | 19443.000000 | 0.000000 |
| max | 0.999438 | 76247.000000 | 1.000000 |
# Drop rows with any missing value. Bug fix: the original call used
# inplace=False and discarded the return value, so nothing was dropped;
# assign the result back so the cleaning actually applies. (The data has
# no NaNs — see isnull().sum() below — so contents are unchanged.)
huwaei = huwaei.dropna(how="any")
huwaei
| timestamp | kpi_value | request_count | anomaly_label | |
|---|---|---|---|---|
| 0 | 2020-08-14 02:00:00+02:00 | 0.998755 | 24908 | 0 |
| 1 | 2020-08-14 02:01:00+02:00 | 0.997683 | 25029 | 0 |
| 2 | 2020-08-14 02:02:00+02:00 | 0.998341 | 24115 | 0 |
| 3 | 2020-08-14 02:03:00+02:00 | 0.998211 | 24031 | 0 |
| 4 | 2020-08-14 02:04:00+02:00 | 0.998403 | 23790 | 0 |
| ... | ... | ... | ... | ... |
| 20154 | 2020-08-28 01:56:00+02:00 | 0.998149 | 26467 | 0 |
| 20155 | 2020-08-28 01:57:00+02:00 | 0.998340 | 26502 | 0 |
| 20156 | 2020-08-28 01:58:00+02:00 | 0.998364 | 26887 | 0 |
| 20157 | 2020-08-28 01:59:00+02:00 | 0.998428 | 26712 | 0 |
| 20158 | 2020-08-28 02:00:00+02:00 | 0.997407 | 29694 | 0 |
20159 rows × 4 columns
# Count missing values per column (all zero)
huwaei.isnull().sum()
timestamp 0 kpi_value 0 request_count 0 anomaly_label 0 dtype: int64
# Flag duplicated rows
huwaei.duplicated()
0 False
1 False
2 False
3 False
4 False
...
20154 False
20155 False
20156 False
20157 False
20158 False
Length: 20159, dtype: bool
# Total number of duplicated rows (zero)
huwaei.duplicated().sum()
0
# Column dtypes
huwaei.dtypes
timestamp datetime64[ns, pytz.FixedOffset(120)] kpi_value float64 request_count int64 anomaly_label int64 dtype: object
# Convenience series reused throughout the notebook
kpi = huwaei['kpi_value']
timestamp = huwaei['timestamp']
label = huwaei['anomaly_label']

# KPI on the left axis, anomaly labels on a twin right axis
fig, kpi_ax = plt.subplots(figsize=(40, 10))
label_ax = kpi_ax.twinx()
kpi_ax.plot(timestamp, kpi, color='blue')
label_ax.plot(timestamp, label, color='red', linewidth='1')
kpi_ax.set_xlabel('Time Stamp', fontsize=20)
kpi_ax.set_ylabel('KPI Value', fontsize=20)
label_ax.set_ylabel('Anomaly Label', fontsize=20)
plt.grid()
plt.title('Huwaei Dataset', fontsize=30)
Text(0.5, 1.0, 'Huwaei Dataset')
# Inspect the timestamp series
timestamp
# Locate the positional index of a chosen timestamp (used as a split point below)
arr=np.where(timestamp=='2020-08-24 00:00:00+02:00')
arr
(array([14280], dtype=int64),)
# Hand-picked row positions that delimit the alternating train/test periods
split1, split2, split3, split4, split5, split6 = (
    timestamp[i] for i in (4200, 4800, 7080, 7680, 11400, 14280)
)
# Replot the series with the chosen split boundaries overlaid
fig, ax2d = plt.subplots(figsize=(40,10))
ax2a = ax2d.twinx()
ax2d.plot(timestamp, kpi, color='blue')
ax2a.plot(timestamp, label, color='red', linewidth='2')
ax2d.set_xlabel('Time Stamp', fontsize=20)
ax2d.set_ylabel('KPI Value', fontsize=20)
ax2a.set_ylabel('Anomaly Label', fontsize=20)
# One dashed vertical line per split boundary
for boundary in (split1, split2, split3, split4, split5, split6):
    ax2d.axvline(boundary, color='green', linestyle='dashed', linewidth='2')
plt.grid()
plt.title('Huwaei Dataset', fontsize=30)
# Report the covered time range and dataset size
print("Start date is: ", timestamp.min())
print("End date is: ", timestamp.max())
print(f"Length of full data: {len(huwaei)}")
Start date is: 2020-08-14 02:00:00+02:00 End date is: 2020-08-28 02:00:00+02:00 Length of full data: 20159
# Collect the split timestamps in chronological order
splits=[split1, split2, split3, split4, split5, split6]
splits
[Timestamp('2020-08-17 00:00:00+0200', tz='pytz.FixedOffset(120)'),
Timestamp('2020-08-17 10:00:00+0200', tz='pytz.FixedOffset(120)'),
Timestamp('2020-08-19 00:00:00+0200', tz='pytz.FixedOffset(120)'),
Timestamp('2020-08-19 10:00:00+0200', tz='pytz.FixedOffset(120)'),
Timestamp('2020-08-22 00:00:00+0200', tz='pytz.FixedOffset(120)'),
Timestamp('2020-08-24 00:00:00+0200', tz='pytz.FixedOffset(120)')]
# Training intervals. Take explicit copies so the later in-place scaling
# (the `frame[cols] = scaler.transform(...)` assignments) writes into
# independent frames instead of views of `huwaei` — avoids the pandas
# SettingWithCopy / chained-assignment hazard hidden by warnings.filterwarnings.
train1 = huwaei.loc[timestamp <= splits[0]].copy()
train2 = huwaei.loc[(timestamp > splits[1]) & (timestamp <= splits[2])].copy()
train3 = huwaei.loc[(timestamp > splits[3]) & (timestamp <= splits[4])].copy()
train4 = huwaei.loc[timestamp > splits[5]].copy()
# Testing intervals
test1 = huwaei.loc[(timestamp > splits[0]) & (timestamp <= splits[1])].copy()
test2 = huwaei.loc[(timestamp > splits[2]) & (timestamp <= splits[3])].copy()
test3 = huwaei.loc[(timestamp > splits[4]) & (timestamp <= splits[5])].copy()
# Plot training periods (blue) and testing periods (red) in sequence,
# with the anomaly labels and split boundaries overlaid
fig, ax2d = plt.subplots(figsize=(40,10))
ax2a = ax2d.twinx()
for part, shade in ((train1, 'blue'), (test1, 'red'),
                    (train2, 'blue'), (test2, 'red'),
                    (train3, 'blue'), (test3, 'red'),
                    (train4, 'blue')):
    ax2d.plot(part['timestamp'], part['kpi_value'], color=shade)
ax2a.plot(timestamp, label, color='green', linewidth='2')
ax2d.set_xlabel('Time Stamp', fontsize=20)
ax2d.set_ylabel('KPI Value', fontsize=20)
ax2a.set_ylabel('Anomaly Label', fontsize=20)
for boundary in (split1, split2, split3, split4, split5, split6):
    ax2d.axvline(boundary, color='orange', linestyle='dashed', linewidth='2')
plt.grid()
plt.title('Huwaei Dataset', fontsize=30)
Text(0.5, 1.0, 'Huwaei Dataset')
# Preprocessing: standardize the features to zero mean / unit variance.
# The scaler is fit on the training periods only, then applied to both
# train and test to avoid leaking test statistics into the model.
feature_cols = ['kpi_value', 'request_count']
scaler = StandardScaler()
scaler = scaler.fit(pd.concat([train1, train2, train3, train4])[feature_cols])
for frame in (train1, train2, train3, train4):
    frame[feature_cols] = scaler.transform(frame[feature_cols])
trainSeqs = [train1, train2, train3, train4]
# Avoid leakage of information between train and test dataset
for frame in (test1, test2, test3):
    frame[feature_cols] = scaler.transform(frame[feature_cols])
testSeqs = [test1, test2, test3]
# Compute the train to test ratio (fractions of the full dataset)
learn_ratio = pd.concat(trainSeqs).shape[0] / huwaei.shape[0]
print("Learn dataset percentage: " "{:.2f}" "%".format(learn_ratio*100))
test_ratio = pd.concat(testSeqs).shape[0] / huwaei.shape[0]
# Bug fix: this line previously printed "Learn dataset percentage" again
print("Test dataset percentage: " "{:.2f}" "%".format(test_ratio*100))
Learn dataset percentage: 79.76% Learn dataset percentage: 20.24%
# Inspect the scaled training periods
trainSeqs
[ timestamp kpi_value request_count anomaly_label
0 2020-08-14 02:00:00+02:00 1.508552 1.765505 0
1 2020-08-14 02:01:00+02:00 1.243159 1.783991 0
2 2020-08-14 02:02:00+02:00 1.406095 1.644353 0
3 2020-08-14 02:03:00+02:00 1.373775 1.631520 0
4 2020-08-14 02:04:00+02:00 1.421287 1.594701 0
... ... ... ... ...
4196 2020-08-16 23:56:00+02:00 1.164309 1.031870 0
4197 2020-08-16 23:57:00+02:00 1.160132 1.069912 0
4198 2020-08-16 23:58:00+02:00 1.081624 0.943412 0
4199 2020-08-16 23:59:00+02:00 1.086746 0.860760 0
4200 2020-08-17 00:00:00+02:00 0.492590 9.208951 0
[4201 rows x 4 columns],
timestamp kpi_value request_count anomaly_label
4801 2020-08-17 10:01:00+02:00 -0.314337 -0.372455 0
4802 2020-08-17 10:02:00+02:00 -0.505045 -0.493149 0
4803 2020-08-17 10:03:00+02:00 -1.001756 -0.537760 0
4804 2020-08-17 10:04:00+02:00 -0.656774 -0.542191 0
4805 2020-08-17 10:05:00+02:00 -0.512424 -0.562968 0
... ... ... ... ...
7076 2020-08-18 23:56:00+02:00 1.274315 1.097412 0
7077 2020-08-18 23:57:00+02:00 1.178616 1.100773 0
7078 2020-08-18 23:58:00+02:00 1.222305 1.077245 0
7079 2020-08-18 23:59:00+02:00 1.317042 0.987412 0
7080 2020-08-19 00:00:00+02:00 0.122372 9.494949 0
[2280 rows x 4 columns],
timestamp kpi_value request_count anomaly_label
7681 2020-08-19 10:01:00+02:00 -0.771397 -0.360233 0
7682 2020-08-19 10:02:00+02:00 -0.809615 -0.384678 0
7683 2020-08-19 10:03:00+02:00 -0.715002 -0.382539 0
7684 2020-08-19 10:04:00+02:00 -0.774978 -0.450066 0
7685 2020-08-19 10:05:00+02:00 -0.977072 -0.497427 0
... ... ... ... ...
11396 2020-08-21 23:56:00+02:00 0.856348 0.400901 0
11397 2020-08-21 23:57:00+02:00 0.980600 0.401971 0
11398 2020-08-21 23:58:00+02:00 1.119249 0.345443 0
11399 2020-08-21 23:59:00+02:00 0.913249 0.303583 0
11400 2020-08-22 00:00:00+02:00 0.132003 7.406794 0
[3720 rows x 4 columns],
timestamp kpi_value request_count anomaly_label
14281 2020-08-24 00:01:00+02:00 -0.397133 4.243700 0
14282 2020-08-24 00:02:00+02:00 -0.526726 2.170211 0
14283 2020-08-24 00:03:00+02:00 -0.462762 1.790866 0
14284 2020-08-24 00:04:00+02:00 -0.609524 1.652603 0
14285 2020-08-24 00:05:00+02:00 0.370959 1.594701 0
... ... ... ... ...
20154 2020-08-28 01:56:00+02:00 1.358435 2.003684 0
20155 2020-08-28 01:57:00+02:00 1.405715 2.009032 0
20156 2020-08-28 01:58:00+02:00 1.411597 2.067851 0
20157 2020-08-28 01:59:00+02:00 1.427468 2.041115 0
20158 2020-08-28 02:00:00+02:00 1.174925 2.496696 0
[5878 rows x 4 columns]]
# Inspect the scaled testing periods
testSeqs
[ timestamp kpi_value request_count anomaly_label
4201 2020-08-17 00:01:00+02:00 -0.298217 4.215895 0
4202 2020-08-17 00:02:00+02:00 -0.174642 2.022476 0
4203 2020-08-17 00:03:00+02:00 0.387258 1.556812 0
4204 2020-08-17 00:04:00+02:00 0.446124 1.490659 0
4205 2020-08-17 00:05:00+02:00 0.582078 1.358966 0
... ... ... ... ...
4796 2020-08-17 09:56:00+02:00 -0.072083 -0.438761 0
4797 2020-08-17 09:57:00+02:00 -0.350213 -0.522177 0
4798 2020-08-17 09:58:00+02:00 -0.672569 -0.582065 0
4799 2020-08-17 09:59:00+02:00 -0.563406 -0.626371 0
4800 2020-08-17 10:00:00+02:00 -0.304437 -0.418136 0
[600 rows x 4 columns],
timestamp kpi_value request_count anomaly_label
7081 2020-08-19 00:01:00+02:00 -0.759512 4.386852 0
7082 2020-08-19 00:02:00+02:00 0.168237 2.064948 0
7083 2020-08-19 00:03:00+02:00 0.329986 1.596228 0
7084 2020-08-19 00:04:00+02:00 0.338085 1.513882 0
7085 2020-08-19 00:05:00+02:00 0.695776 1.433979 0
... ... ... ... ...
7676 2020-08-19 09:56:00+02:00 -0.175670 -0.427150 0
7677 2020-08-19 09:57:00+02:00 -0.606715 -0.433261 0
7678 2020-08-19 09:58:00+02:00 -0.454601 -0.492080 0
7679 2020-08-19 09:59:00+02:00 -0.918436 -0.478177 0
7680 2020-08-19 10:00:00+02:00 -0.499805 -0.326470 0
[600 rows x 4 columns],
timestamp kpi_value request_count anomaly_label
11401 2020-08-22 00:01:00+02:00 -1.145829 3.076637 0
11402 2020-08-22 00:02:00+02:00 -0.777070 1.166314 0
11403 2020-08-22 00:03:00+02:00 -0.110366 0.824094 0
11404 2020-08-22 00:04:00+02:00 -0.140076 0.722650 0
11405 2020-08-22 00:05:00+02:00 0.316230 0.681094 0
... ... ... ... ...
14276 2020-08-23 23:56:00+02:00 0.106058 1.186481 0
14277 2020-08-23 23:57:00+02:00 0.795930 1.293424 0
14278 2020-08-23 23:58:00+02:00 0.695386 1.230480 0
14279 2020-08-23 23:59:00+02:00 -0.039992 1.115898 0
14280 2020-08-24 00:00:00+02:00 0.334182 9.460269 0
[2880 rows x 4 columns]]
TIME_STEPS = 32
# Create the sequences of size TIME_STEPS to feed the sequential model
def to_sequences(x, y, TIME_STEPS=1):
    """Slide a window of length TIME_STEPS over x and y.

    x, y: DataFrames (or anything with .iloc) of equal length.
    Returns (X, Y) numpy arrays of shape (n, TIME_STEPS, n_cols), where
    Y is X shifted one step forward (next-step prediction targets).
    """
    x_values = []
    y_values = []
    for i in range(len(x)-TIME_STEPS):
        x_values.append(x.iloc[i:(i+TIME_STEPS)].values)
        # Target window starts one step after the input window
        y_values.append(y.iloc[i+1:(i+TIME_STEPS+1)].values)
    return np.array(x_values), np.array(y_values)
# Stack the sequences obtained in each period to create a single training tensor
def _stack_sequences(seqs):
    # Window each period independently, then join along the batch axis
    xs, ys = [], []
    for seq in seqs:
        x, y = to_sequences(seq[['kpi_value']], seq[['kpi_value']], TIME_STEPS)
        xs.append(x)
        ys.append(y)
    return np.concatenate(xs), np.concatenate(ys)

trainX, trainY = _stack_sequences(trainSeqs)
testX, testY = _stack_sequences(testSeqs)
print(f"Train X shape: {trainX.shape[0]} batches, {trainX.shape[1]} values for each batch, {trainX.shape[2]} dimensional batch")
print(f"Train Y shape: {trainY.shape[0]} batches, {trainY.shape[1]} values for each batch, {trainY.shape[2]} dimensional batch")
print(f"Test X shape: {testX.shape[0]} batches, {testX.shape[1]} values for each batch, {testX.shape[2]} dimensional batch")
print(f"Test Y shape: {testY.shape[0]} batches, {testY.shape[1]} values for each batch, {testY.shape[2]} dimensional batch")
# 15951 + 3984 + (32*7) = 20159
# 32 values for each sample
# 3984 testing batches
Train X shape: 15951 batches, 32 values for each batch, 1 dimensional batch Train Y shape: 15951 batches, 32 values for each batch, 1 dimensional batch Test X shape: 3984 batches, 32 values for each batch, 1 dimensional batch Test Y shape: 3984 batches, 32 values for each batch, 1 dimensional batch
# Consecutive input windows overlap by all but one time step
print(trainX[0])
print("")
print(trainX[1])
[[1.50855227] [1.24315929] [1.40609472] [1.37377529] [1.42128722] [1.39961253] [1.44074339] [1.49781123] [1.58274419] [1.4783101 ] [1.47234347] [1.40898944] [1.283624 ] [1.47115683] [1.47085535] [1.46514692] [1.38556393] [1.34201061] [1.48229737] [1.51873494] [1.4757048 ] [1.54065301] [1.49302045] [1.49090656] [1.52805407] [1.47927088] [1.44423395] [1.54526938] [1.55088521] [1.45952541] [1.44717635] [1.57334216]] [[1.24315929] [1.40609472] [1.37377529] [1.42128722] [1.39961253] [1.44074339] [1.49781123] [1.58274419] [1.4783101 ] [1.47234347] [1.40898944] [1.283624 ] [1.47115683] [1.47085535] [1.46514692] [1.38556393] [1.34201061] [1.48229737] [1.51873494] [1.4757048 ] [1.54065301] [1.49302045] [1.49090656] [1.52805407] [1.47927088] [1.44423395] [1.54526938] [1.55088521] [1.45952541] [1.44717635] [1.57334216] [1.57028136]]
# The target window is the input window shifted one step ahead
print(trainX[0])
print("")
print(trainY[0])
[[1.50855227] [1.24315929] [1.40609472] [1.37377529] [1.42128722] [1.39961253] [1.44074339] [1.49781123] [1.58274419] [1.4783101 ] [1.47234347] [1.40898944] [1.283624 ] [1.47115683] [1.47085535] [1.46514692] [1.38556393] [1.34201061] [1.48229737] [1.51873494] [1.4757048 ] [1.54065301] [1.49302045] [1.49090656] [1.52805407] [1.47927088] [1.44423395] [1.54526938] [1.55088521] [1.45952541] [1.44717635] [1.57334216]] [[1.24315929] [1.40609472] [1.37377529] [1.42128722] [1.39961253] [1.44074339] [1.49781123] [1.58274419] [1.4783101 ] [1.47234347] [1.40898944] [1.283624 ] [1.47115683] [1.47085535] [1.46514692] [1.38556393] [1.34201061] [1.48229737] [1.51873494] [1.4757048 ] [1.54065301] [1.49302045] [1.49090656] [1.52805407] [1.47927088] [1.44423395] [1.54526938] [1.55088521] [1.45952541] [1.44717635] [1.57334216] [1.57028136]]
# LSTM autoencoder: encode each 32-step window into a 128-d vector,
# repeat it across the window length, and decode back to a per-step
# reconstruction; trained with MAE reconstruction loss.
model = Sequential([
    LSTM(128, input_shape=(trainX.shape[1], trainX.shape[2])),
    Dropout(0.2),
    RepeatVector(trainX.shape[1]),
    LSTM(128, return_sequences=True),
    Dropout(0.2),
    TimeDistributed(Dense(1)),
])
model.compile(optimizer='adam', loss='mae')
model.summary()
Model: "sequential_1" _________________________________________________________________ Layer (type) Output Shape Param # ================================================================= lstm_2 (LSTM) (None, 128) 66560 _________________________________________________________________ dropout_2 (Dropout) (None, 128) 0 _________________________________________________________________ repeat_vector_1 (RepeatVecto (None, 32, 128) 0 _________________________________________________________________ lstm_3 (LSTM) (None, 32, 128) 131584 _________________________________________________________________ dropout_3 (Dropout) (None, 32, 128) 0 _________________________________________________________________ time_distributed_1 (TimeDist (None, 32, 1) 129 ================================================================= Total params: 198,273 Trainable params: 198,273 Non-trainable params: 0 _________________________________________________________________
# Fit model
# EarlyStopping halts training once val_loss stops improving for 5 epochs.
# NOTE(review): restore_best_weights=True would keep the best-epoch weights
# instead of the last epoch's — consider enabling (would change results).
history = model.fit(trainX, trainY, epochs=50, batch_size=32, validation_split=0.1, verbose=1, callbacks=[
keras.callbacks.EarlyStopping(monitor="val_loss", patience=5, mode="min")
])
Epoch 1/50 449/449 [==============================] - 86s 161ms/step - loss: 0.2975 - val_loss: 0.1932 Epoch 2/50 449/449 [==============================] - 64s 143ms/step - loss: 0.2363 - val_loss: 0.1781 Epoch 3/50 449/449 [==============================] - 65s 144ms/step - loss: 0.2259 - val_loss: 0.1740 Epoch 4/50 449/449 [==============================] - 78s 173ms/step - loss: 0.2226 - val_loss: 0.1727 Epoch 5/50 449/449 [==============================] - 75s 168ms/step - loss: 0.2216 - val_loss: 0.1675 Epoch 6/50 449/449 [==============================] - 70s 157ms/step - loss: 0.2185 - val_loss: 0.1654 Epoch 7/50 449/449 [==============================] - 70s 156ms/step - loss: 0.2155 - val_loss: 0.1682 Epoch 8/50 449/449 [==============================] - 75s 168ms/step - loss: 0.2165 - val_loss: 0.1627 Epoch 9/50 449/449 [==============================] - 74s 165ms/step - loss: 0.2128 - val_loss: 0.1604 Epoch 10/50 449/449 [==============================] - 74s 166ms/step - loss: 0.2100 - val_loss: 0.1561 Epoch 11/50 449/449 [==============================] - 78s 174ms/step - loss: 0.2090 - val_loss: 0.1573 Epoch 12/50 449/449 [==============================] - 90s 202ms/step - loss: 0.2074 - val_loss: 0.1610 Epoch 13/50 449/449 [==============================] - 76s 169ms/step - loss: 0.2067 - val_loss: 0.1540 Epoch 14/50 449/449 [==============================] - 77s 172ms/step - loss: 0.2044 - val_loss: 0.1608 Epoch 15/50 449/449 [==============================] - 75s 166ms/step - loss: 0.2033 - val_loss: 0.1535 Epoch 16/50 449/449 [==============================] - 79s 175ms/step - loss: 0.2028 - val_loss: 0.1528 Epoch 17/50 449/449 [==============================] - 77s 172ms/step - loss: 0.2008 - val_loss: 0.1518 Epoch 18/50 449/449 [==============================] - 77s 171ms/step - loss: 0.2017 - val_loss: 0.1515 Epoch 19/50 449/449 [==============================] - 76s 169ms/step - loss: 0.2006 - val_loss: 0.1531 Epoch 20/50 449/449 
[==============================] - 73s 163ms/step - loss: 0.2011 - val_loss: 0.1553 Epoch 21/50 449/449 [==============================] - 74s 165ms/step - loss: 0.2001 - val_loss: 0.1495 Epoch 22/50 449/449 [==============================] - 74s 165ms/step - loss: 0.1992 - val_loss: 0.1520 Epoch 23/50 449/449 [==============================] - 74s 164ms/step - loss: 0.1987 - val_loss: 0.1526 Epoch 24/50 449/449 [==============================] - 74s 164ms/step - loss: 0.1961 - val_loss: 0.1485 Epoch 25/50 449/449 [==============================] - 73s 163ms/step - loss: 0.1958 - val_loss: 0.1507 Epoch 26/50 449/449 [==============================] - 72s 161ms/step - loss: 0.1978 - val_loss: 0.1478 Epoch 27/50 449/449 [==============================] - 71s 159ms/step - loss: 0.1967 - val_loss: 0.1485 Epoch 28/50 449/449 [==============================] - 73s 163ms/step - loss: 0.1954 - val_loss: 0.1484 Epoch 29/50 449/449 [==============================] - 72s 161ms/step - loss: 0.1938 - val_loss: 0.1477 Epoch 30/50 449/449 [==============================] - 72s 161ms/step - loss: 0.1935 - val_loss: 0.1492 Epoch 31/50 449/449 [==============================] - 73s 163ms/step - loss: 0.1939 - val_loss: 0.1455 Epoch 32/50 449/449 [==============================] - 72s 161ms/step - loss: 0.1940 - val_loss: 0.1455 Epoch 33/50 449/449 [==============================] - 75s 168ms/step - loss: 0.1925 - val_loss: 0.1458 Epoch 34/50 449/449 [==============================] - 72s 159ms/step - loss: 0.1917 - val_loss: 0.1447 Epoch 35/50 449/449 [==============================] - 74s 164ms/step - loss: 0.1907 - val_loss: 0.1434 Epoch 36/50 449/449 [==============================] - 76s 170ms/step - loss: 0.1906 - val_loss: 0.1434 Epoch 37/50 449/449 [==============================] - 74s 166ms/step - loss: 0.1907 - val_loss: 0.1430 Epoch 38/50 449/449 [==============================] - 74s 166ms/step - loss: 0.1896 - val_loss: 0.1443 Epoch 39/50 449/449 
[==============================] - 80s 178ms/step - loss: 0.1889 - val_loss: 0.1416 Epoch 40/50 449/449 [==============================] - 72s 161ms/step - loss: 0.1898 - val_loss: 0.1439 Epoch 41/50 449/449 [==============================] - 73s 163ms/step - loss: 0.1877 - val_loss: 0.1419 Epoch 42/50 449/449 [==============================] - 72s 161ms/step - loss: 0.1897 - val_loss: 0.1429 Epoch 43/50 449/449 [==============================] - 73s 162ms/step - loss: 0.1895 - val_loss: 0.1420 Epoch 44/50 449/449 [==============================] - 73s 163ms/step - loss: 0.1870 - val_loss: 0.1426
# Reconstruction loss (MAE) on the held-out test windows
model.evaluate(testX,testY)
125/125 [==============================] - 9s 69ms/step - loss: 0.1840
0.18398800492286682
# Plot the training and validation loss
# (curves come from the History object returned by model.fit)
plt.figure(figsize=(40,10))
plt.plot(history.history['loss'], label='Training loss')
plt.plot(history.history['val_loss'], label='Validation loss')
plt.legend()
<matplotlib.legend.Legend at 0x15c23445640>
# Plot a histogram of the reconstruction error in the training dataset to decide a threshold
trainPredict = model.predict(trainX)
# Per-window MAE, averaged over the TIME_STEPS axis
trainMAE = np.mean(np.abs(trainPredict - trainY), axis=1)
plt.figure(figsize=(40,10));
plt.hist(trainMAE, bins=30);
plt.legend(['kpi_value_prediction_error'])
# Manually chosen from the histogram above
threshold_trainMAE = 0.3 #or Define 90% value of max as threshold.
# Histogram of the testing MAE
testPredict = model.predict(testX)
# Per-window MAE on the test set, same reduction as for training
testMAE = np.mean(np.abs(testPredict - testY), axis=1)
plt.figure(figsize=(40,10));
plt.hist(testMAE, bins=30);
# Detect anomaly if the reconstruction loss for a sample is greater than the threshold
# Drop the first TIME_STEPS rows of each test period so the remaining rows
# align 1:1 with the windows produced by to_sequences (one window per row).
anomaly_df = pd.concat([seq[TIME_STEPS:] for seq in testSeqs])
anomaly_df['testMAE'] = testMAE
anomaly_df['threshold_trainMAE'] = threshold_trainMAE
anomaly_df['anomaly'] = anomaly_df['testMAE'] > anomaly_df['threshold_trainMAE']
# Overlay every reconstructed training window in one figure
plt.figure(figsize=(40,10))
for i in range(trainPredict.shape[0]):
    plt.plot(trainPredict[i])
# Same overlay for the test reconstructions
plt.figure(figsize=(40,10))
for i in range(testPredict.shape[0]):
    plt.plot(testPredict[i])
# Number of test windows
testPredict.shape[0]
3984
# Plot the test MAE
plt.figure(figsize=[40, 10])
# Constant threshold line vs per-window test reconstruction error
anomaly_df['threshold_trainMAE'].plot()
anomaly_df['testMAE'].plot()
plt.legend()
<matplotlib.legend.Legend at 0x15c44511280>
# Detect the anomaly points inside the dataset
anomalies = anomaly_df.loc[anomaly_df['anomaly'] == True]
#Plot the anomalies
plt.figure(figsize=(40,10))
anomaly_df['kpi_value'].plot()
anomaly_df['anomaly_label'].plot()
# Mark the detected anomalies as black dots on top of the series
anomalies['kpi_value'].plot(marker='.', linestyle='None', label='anomaly_detected', color='black')
plt.legend()
<matplotlib.legend.Legend at 0x15c438887c0>
# Inspect the resulting dataset (test rows + MAE, threshold, and anomaly flag)
anomaly_df.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 3984 entries, 4233 to 14280 Data columns (total 7 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 timestamp 3984 non-null datetime64[ns, pytz.FixedOffset(120)] 1 kpi_value 3984 non-null float64 2 request_count 3984 non-null float64 3 anomaly_label 3984 non-null int64 4 testMAE 3984 non-null float64 5 threshold_trainMAE 3984 non-null float64 6 anomaly 3984 non-null bool dtypes: bool(1), datetime64[ns, pytz.FixedOffset(120)](1), float64(4), int64(1) memory usage: 221.8 KB
# Display the annotated test dataset
anomaly_df
| timestamp | kpi_value | request_count | anomaly_label | testMAE | threshold_trainMAE | anomaly | |
|---|---|---|---|---|---|---|---|
| 4233 | 2020-08-17 00:33:00+02:00 | 1.046498 | 1.249119 | 0 | 0.091636 | 0.3 | False |
| 4234 | 2020-08-17 00:34:00+02:00 | 0.907252 | 1.161120 | 0 | 0.079620 | 0.3 | False |
| 4235 | 2020-08-17 00:35:00+02:00 | 1.017913 | 1.226050 | 0 | 0.088194 | 0.3 | False |
| 4236 | 2020-08-17 00:36:00+02:00 | 1.241087 | 1.310383 | 0 | 0.092561 | 0.3 | False |
| 4237 | 2020-08-17 00:37:00+02:00 | 1.232323 | 1.260119 | 0 | 0.089862 | 0.3 | False |
| ... | ... | ... | ... | ... | ... | ... | ... |
| 14276 | 2020-08-23 23:56:00+02:00 | 0.106058 | 1.186481 | 0 | 0.144077 | 0.3 | False |
| 14277 | 2020-08-23 23:57:00+02:00 | 0.795930 | 1.293424 | 0 | 0.149961 | 0.3 | False |
| 14278 | 2020-08-23 23:58:00+02:00 | 0.695386 | 1.230480 | 0 | 0.143180 | 0.3 | False |
| 14279 | 2020-08-23 23:59:00+02:00 | -0.039992 | 1.115898 | 0 | 0.164715 | 0.3 | False |
| 14280 | 2020-08-24 00:00:00+02:00 | 0.334182 | 9.460269 | 0 | 0.169704 | 0.3 | False |
3984 rows × 7 columns
# Compute the precision in the test dataset
# (fraction of flagged windows that are labeled anomalies)
precision_score(anomaly_df['anomaly_label'], anomaly_df['anomaly'])
0.7029850746268657
# Compute the recall in the test dataset
# (fraction of labeled anomalies that were flagged)
recall_score(anomaly_df['anomaly_label'], anomaly_df['anomaly'])
0.6826086956521739
# Compute the F1 score in the test dataset
# (harmonic mean of precision and recall)
f1_score(anomaly_df['anomaly_label'], anomaly_df['anomaly'])
0.6926470588235294